# Alex Gazmararian
# agazmararian@gmail.com

library(tidyverse)
library(tidylog)
library(readxl)
library(here)
library(modelsummary)
library(janitor)

years <- 1983:2024
urls <- paste0("https://www.unionstats.com/state/xls/state_", years, ".xlsx")

# Create local cache directory
cache_dir <- here("data", "input", "state_union")
if (!dir.exists(cache_dir)) {
    dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE)
    message("Created directory: ", cache_dir)
}

# Helper function to download or load from cache
download_or_load <- function(url, year) {
    filename <- paste0("state_", year, ".xlsx")
    local_file <- file.path(cache_dir, filename)
    
    if (file.exists(local_file)) {
        message("Loading cached file for year ", year)
        temp_file <- local_file
    } else {
        message("Downloading data for year ", year, " from: ", url)
        temp_file <- local_file
        download.file(url = url, destfile = temp_file, mode = "wb")
        message("Saved to: ", temp_file)
    }
    
    union <- read_xlsx(temp_file, skip = 2, progress = FALSE)
    union <- clean_names(union)
    union <- subset(union, sector == "Private")
    union <- union %>%
        dplyr::select(state, percent_mem) %>%
        rename(union_mem = percent_mem) 
    union$union_mem <- as.numeric(union$union_mem)
    union$union_mem_med <- as.integer(union$union_mem > median(union$union_mem))
    union$year <- year
    union
}

union <- lapply(seq_along(urls), function(i) {
    download_or_load(urls[i], years[i])
})
union <- bind_rows(union)

# Lag one year
union$year_lag <- union$year + 1

# state abbreviation
union$state.abb <- state.abb[match(union$state, state.name)]

write_csv(union, here("data", "inter", "union_state_processed.csv"))
message("Successfully processed and saved union data to: ", here("data", "inter", "union_state_processed.csv"))